import scanpy as sc #software suite of tools for single-cell analysis in python
import besca as bc #internal BEDA package for single cell analysis
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy.external as sce
from harmony import harmonize
import umap.umap_ as umap
from matplotlib.pyplot import rc_context
import matplotlib_inline.backend_inline
import warnings
print(ad.__version__)
warnings.filterwarnings("ignore")
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpuz2ta72v INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpuz2ta72v/_remote_module_non_scriptable.py INFO:lightning_fabric.utilities.seed:Global seed set to 0
0.9.1
Displaying result settings required for single cell analysis
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.3 anndata==0.9.1 umap==0.3.10 numpy==1.22.4 scipy==1.10.1 pandas==1.5.3 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.10
#Reading last saved annoatated data object written in h5ad data format.
#We used similar adata variable to make similar previous data analysis
# List of file paths
file_paths = [
'/home/jana/balsarc1.h5ad',
'/home/jana/balsarc2.h5ad',
'/home/jana/balsarc3.h5ad',
'/home/jana/balhealth1.h5ad',
'/home/jana/balhealth2.h5ad',
'/home/jana/balhealthy3.h5ad',
'/home/jana/balhealthy4.h5ad',
'/home/jana/balhealthy5.h5ad',
'/home/jana/balhealthy6.h5ad',
'/home/jana/balhealthy7.h5ad',
'/home/jana/balhealthy8.h5ad',
'/home/jana/balhealthy9.h5ad',
'/home/jana/balhealthy10.h5ad'
]
# List to store loaded data objects
data_objects = []
# Loop to read h5ad files and store data objects
for file_path in file_paths:
data_objects.append(sc.read_h5ad(file_path))
# Unpack data objects to individual variables
balsarc1, balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10 = data_objects
Cell cycle Genes Finding
# Read cell cycle genes from file
with open('./data/regev_lab_cell_cycle_genes.txt') as file:
cell_cycle_genes = [x.strip() for x in file]
#print(len(cell_cycle_genes))
print("In the regev-lab dataset cell cycle genes lenghth:", len(cell_cycle_genes))
# Split into S phase and G2/M phase gene lists
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
# Filter cell cycle genes based on availability in different datasets
cell_cycle_genes = [x for x in cell_cycle_genes if x in balsarc1.var_names]
print("Sample balsarc1 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balsarc2.var_names]
print("Sample balsarc2 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balsarc3.var_names]
print("Sample balsarc3 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy1.var_names]
print("Sample balhealthy1 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy2.var_names]
print("Sample balhealthy2 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy3.var_names]
print("Sample balhealthy3 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy4.var_names]
print("Sample balhealthy4 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy5.var_names]
print("Sample balhealthy5 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy6.var_names]
print("Sample balhealthy6 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy7.var_names]
print("Sample balhealthy7 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy8.var_names]
print("Sample balhealthy8 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy9.var_names]
print("Sample balhealthy9 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in balhealthy10.var_names]
print("Sample balhealthy10 cell cycle genes lenghth:", len(cell_cycle_genes))
In the regev-lab dataset cell cycle genes lenghth: 97 Sample balsarc1 cell cycle genes lenghth: 94 Sample balsarc2 cell cycle genes lenghth: 94 Sample balsarc3 cell cycle genes lenghth: 94 Sample balhealthy1 cell cycle genes lenghth: 94 Sample balhealthy2 cell cycle genes lenghth: 94 Sample balhealthy3 cell cycle genes lenghth: 94 Sample balhealthy4 cell cycle genes lenghth: 94 Sample balhealthy5 cell cycle genes lenghth: 94 Sample balhealthy6 cell cycle genes lenghth: 94 Sample balhealthy7 cell cycle genes lenghth: 94 Sample balhealthy8 cell cycle genes lenghth: 94 Sample balhealthy9 cell cycle genes lenghth: 94 Sample balhealthy10 cell cycle genes lenghth: 94
Caculating score_genes_cell_cycle: s_genes and g2m_genes
adata_list = [balsarc1, balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10]
for adata in adata_list:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
555 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
599 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
558 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
643 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
643 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
728 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
644 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
686 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
514 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
600 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
600 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
600 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
639 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
686 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
601 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
642 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
642 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
725 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
602 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
685 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
598 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
641 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
774 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
729 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
642 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
685 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
Violin plot of 'S_score' and 'G2M_score' of all samples
adata_list = [balsarc1, balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10]
for adata in adata_list:
sc.pl.violin(adata, keys=['S_score', 'G2M_score'], groupby = 'sample', rotation=90)
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
Display all samples annotated data
adata_list = [balsarc1, balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10]
for adata in adata_list:
print(adata)
AnnData object with n_obs × n_vars = 11012 × 19931
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 11241 × 20264
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 4547 × 20484
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 5607 × 19466
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 5710 × 19525
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 4336 × 19223
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 4397 × 18328
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 5220 × 19022
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 4731 × 19113
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 5327 × 18906
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 6108 × 19110
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 2438 × 14964
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 3720 × 17524
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
saving the all files after cell score computing
save_files = [
'/home/jana/balsarc1.h5ad',
'/home/jana/balsarc2.h5ad',
'/home/jana/balsarc3.h5ad',
'/home/jana/balhealth1.h5ad',
'/home/jana/balhealth2.h5ad',
'/home/jana/balhealthy3.h5ad',
'/home/jana/balhealthy4.h5ad',
'/home/jana/balhealthy5.h5ad',
'/home/jana/balhealthy6.h5ad',
'/home/jana/balhealthy7.h5ad',
'/home/jana/balhealthy8.h5ad',
'/home/jana/balhealthy9.h5ad',
'/home/jana/balhealthy10.h5ad'
]
adata_list = [balsarc1, balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10]
# Save each adata to the corresponding file
for adata, save_file in zip(adata_list, save_files):
adata.write_h5ad(save_file)
from matplotlib.pyplot import rc_context
import matplotlib_inline.backend_inline
import warnings
# Suppress UserWarning
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Suppress DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Set matplotlib formats using the updated method
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
sc.set_figure_params(dpi=80)
with rc_context({'figure.figsize': (4, 4)}):
sc.pl.umap(balsarc1, color = 'leiden')
sc.pl.umap(balsarc2, color = 'leiden')
sc.pl.umap(balsarc3, color = 'leiden')
sc.pl.umap(balhealthy1, color = 'leiden')
sc.pl.umap(balhealthy2, color = 'leiden')
sc.pl.umap(balhealthy3, color = 'leiden')
sc.pl.umap(balhealthy4, color = 'leiden')
sc.pl.umap(balhealthy5, color = 'leiden')
sc.pl.umap(balhealthy6, color = 'leiden')
sc.pl.umap(balhealthy7, color = 'leiden')
sc.pl.umap(balhealthy8, color = 'leiden')
sc.pl.umap(balhealthy9, color = 'leiden')
sc.pl.umap(balhealthy10, color = 'leiden')
Overlapping Genes finding across all samples
var_names = balsarc2.var_names.intersection(balsarc1.var_names)
var_names = var_names.intersection(balsarc3.var_names)
var_names = var_names.intersection(balhealthy1.var_names)
var_names = var_names.intersection(balhealthy2.var_names)
var_names = var_names.intersection(balhealthy3.var_names)
var_names = var_names.intersection(balhealthy4.var_names)
var_names = var_names.intersection(balhealthy5.var_names)
var_names = var_names.intersection(balhealthy6.var_names)
var_names = var_names.intersection(balhealthy7.var_names)
var_names = var_names.intersection(balhealthy8.var_names)
var_names = var_names.intersection(balhealthy9.var_names)
var_names = var_names.intersection(balhealthy10.var_names)
print("Overlapping Genes length across all samples:", len(var_names))
Overlapping Genes length across all samples: 14143
Overlapping Genes as var_names assigned to all samples
balsarc1 = balsarc1[:, var_names]
balsarc2 = balsarc2[:, var_names]
balsarc3 = balsarc3[:, var_names]
balhealthy1 = balhealthy1[:, var_names]
balhealthy2 = balhealthy2[:, var_names]
balhealthy3 = balhealthy3[:, var_names]
balhealthy4 = balhealthy4[:, var_names]
balhealthy5 = balhealthy5[:, var_names]
balhealthy6 = balhealthy6[:, var_names]
balhealthy7 = balhealthy7[:, var_names]
balhealthy8 = balhealthy8[:, var_names]
balhealthy9 = balhealthy9[:, var_names]
balhealthy10 = balhealthy10[:, var_names]
Ingestion: This function combines embeddings and annotations from an adata object with those from a reference dataset. Since our observation 'pbmchealthy4' exhibits the highest number of Leiden clusters, we selected it as the reference dataset.
%%time
import numba
import warnings
# Suppress NumbaPerformanceWarning
warnings.filterwarnings("ignore", category=numba.NumbaPerformanceWarning)
sc.tl.ingest(balsarc1, balsarc2, obs= 'leiden')
sc.tl.ingest(balsarc3, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy1, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy2, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy3, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy4, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy5, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy6, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy7, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy8, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy9, balsarc2, obs= 'leiden')
sc.tl.ingest(balhealthy10, balsarc2, obs= 'leiden')
running ingest
finished (0:00:30)
running ingest
finished (0:00:15)
running ingest
finished (0:00:17)
running ingest
finished (0:00:17)
running ingest
finished (0:00:15)
running ingest
finished (0:00:15)
running ingest
finished (0:00:17)
running ingest
finished (0:00:15)
running ingest
finished (0:00:17)
running ingest
finished (0:00:17)
running ingest
finished (0:00:12)
running ingest
finished (0:00:13)
CPU times: user 4min 37s, sys: 1min 16s, total: 5min 53s
Wall time: 3min 42s
from matplotlib.pyplot import rc_context
sc.set_figure_params(dpi=100)
with rc_context({'figure.figsize': (4, 4)}):
sc.pl.umap(balsarc1, color = 'leiden')
sc.pl.umap(balsarc2, color = 'leiden')
sc.pl.umap(balsarc3, color = 'leiden')
sc.pl.umap(balhealthy1, color = 'leiden')
sc.pl.umap(balhealthy2, color = 'leiden')
sc.pl.umap(balhealthy3, color = 'leiden')
sc.pl.umap(balhealthy4, color = 'leiden')
sc.pl.umap(balhealthy5, color = 'leiden')
sc.pl.umap(balhealthy6, color = 'leiden')
sc.pl.umap(balhealthy7, color = 'leiden')
sc.pl.umap(balhealthy8, color = 'leiden')
sc.pl.umap(balhealthy9, color = 'leiden')
sc.pl.umap(balhealthy10, color = 'leiden')
#we used adata as merged object
adata = balsarc1.concatenate(balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10)
Displaying merged object observation and variables types
# Displaying merged object observation and variables types
print (adata)
print ("------###---")
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
#Displaying counts total cells counts of healthy and Sarcoid (Sarc)
print ("------###---")
display(adata.obs['type'].value_counts())
AnnData object with n_obs × n_vars = 74394 × 14143
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch'
var: 'mt', 'ribo', 'gene_ids-0', 'feature_types-0', 'n_cells-0', 'n_cells_by_counts-0', 'mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'mean-0', 'std-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable-1', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'mean-1', 'std-1', 'gene_ids-10', 'n_cells-10', 'n_cells_by_counts-10', 'mean_counts-10', 'pct_dropout_by_counts-10', 'total_counts-10', 'highly_variable-10', 'means-10', 'dispersions-10', 'dispersions_norm-10', 'mean-10', 'std-10', 'gene_ids-11', 'n_cells-11', 'n_cells_by_counts-11', 'mean_counts-11', 'pct_dropout_by_counts-11', 'total_counts-11', 'highly_variable-11', 'means-11', 'dispersions-11', 'dispersions_norm-11', 'mean-11', 'std-11', 'gene_ids-12', 'n_cells-12', 'n_cells_by_counts-12', 'mean_counts-12', 'pct_dropout_by_counts-12', 'total_counts-12', 'highly_variable-12', 'means-12', 'dispersions-12', 'dispersions_norm-12', 'mean-12', 'std-12', 'gene_ids-2', 'feature_types-2', 'n_cells-2', 'n_cells_by_counts-2', 'mean_counts-2', 'pct_dropout_by_counts-2', 'total_counts-2', 'highly_variable-2', 'means-2', 'dispersions-2', 'dispersions_norm-2', 'mean-2', 'std-2', 'gene_ids-3', 'n_cells-3', 'n_cells_by_counts-3', 'mean_counts-3', 'pct_dropout_by_counts-3', 'total_counts-3', 'highly_variable-3', 'means-3', 'dispersions-3', 'dispersions_norm-3', 'mean-3', 'std-3', 'gene_ids-4', 'n_cells-4', 'n_cells_by_counts-4', 'mean_counts-4', 'pct_dropout_by_counts-4', 'total_counts-4', 'highly_variable-4', 'means-4', 'dispersions-4', 'dispersions_norm-4', 'mean-4', 'std-4', 'gene_ids-5', 'n_cells-5', 'n_cells_by_counts-5', 'mean_counts-5', 'pct_dropout_by_counts-5', 'total_counts-5', 'highly_variable-5', 'means-5', 'dispersions-5', 'dispersions_norm-5', 'mean-5', 'std-5', 'gene_ids-6', 'n_cells-6', 'n_cells_by_counts-6', 'mean_counts-6', 'pct_dropout_by_counts-6', 'total_counts-6', 'highly_variable-6', 'means-6', 'dispersions-6', 'dispersions_norm-6', 'mean-6', 'std-6', 'gene_ids-7', 'n_cells-7', 'n_cells_by_counts-7', 'mean_counts-7', 'pct_dropout_by_counts-7', 'total_counts-7', 'highly_variable-7', 'means-7', 'dispersions-7', 'dispersions_norm-7', 'mean-7', 'std-7', 'gene_ids-8', 'n_cells-8', 'n_cells_by_counts-8', 'mean_counts-8', 'pct_dropout_by_counts-8', 'total_counts-8', 'highly_variable-8', 'means-8', 'dispersions-8', 'dispersions_norm-8', 'mean-8', 'std-8', 'gene_ids-9', 'n_cells-9', 'n_cells_by_counts-9', 'mean_counts-9', 'pct_dropout_by_counts-9', 'total_counts-9', 'highly_variable-9', 'means-9', 'dispersions-9', 'dispersions_norm-9', 'mean-9', 'std-9'
obsm: 'X_pca', 'X_umap'
------###---
BAL-Sarc-2 11241 BAL-Sarc-1 11012 BAL-healthy-8 6108 BAL-healthy-2 5710 BAL-healthy-1 5607 BAL-healthy-7 5327 BAL-healthy-5 5220 BAL-healthy-6 4731 BAL-Sarc-3 4547 BAL-healthy-4 4397 BAL-healthy-3 4336 BAL-healthy-10 3720 BAL-healthy-9 2438 Name: sample, dtype: int64
------###---
Healthy 47594 Sarcoidosis 26800 Name: type, dtype: int64
# Print merged adata var (variable) types
display (adata.var)
| mt | ribo | gene_ids-0 | feature_types-0 | n_cells-0 | n_cells_by_counts-0 | mean_counts-0 | pct_dropout_by_counts-0 | total_counts-0 | highly_variable-0 | ... | n_cells_by_counts-9 | mean_counts-9 | pct_dropout_by_counts-9 | total_counts-9 | highly_variable-9 | means-9 | dispersions-9 | dispersions_norm-9 | mean-9 | std-9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LINC00115 | False | False | ENSG00000225880 | Gene Expression | 57 | 57 | 0.004876 | 99.512445 | 57.0 | False | ... | 157 | 0.029280 | 97.196929 | 164.0 | True | 0.020375 | 0.647458 | 0.826835 | -8.041524e-13 | 0.095505 |
| FAM41C | False | False | ENSG00000230368 | Gene Expression | 37 | 37 | 0.003165 | 99.683517 | 37.0 | False | ... | 147 | 0.027495 | 97.375469 | 154.0 | False | 0.012498 | -0.241261 | -0.480772 | -2.624917e-11 | 0.066336 |
| NOC2L | False | False | ENSG00000188976 | Gene Expression | 1377 | 1377 | 0.129843 | 88.221709 | 1518.0 | True | ... | 1971 | 0.460811 | 64.809855 | 2581.0 | False | 0.222090 | 0.204492 | 0.175082 | 2.056991e-11 | 0.282694 |
| KLHL17 | False | False | ENSG00000187961 | Gene Expression | 77 | 77 | 0.006586 | 99.341374 | 77.0 | False | ... | 159 | 0.028388 | 97.161221 | 159.0 | False | 0.013591 | -0.213362 | -0.439723 | -1.765789e-11 | 0.069086 |
| PLEKHN1 | False | False | ENSG00000187583 | Gene Expression | 37 | 37 | 0.003250 | 99.683517 | 38.0 | False | ... | 36 | 0.006606 | 99.357258 | 37.0 | False | 0.007223 | 1.273623 | 1.748135 | -1.606426e-11 | 0.061538 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| AC011043.1 | False | False | ENSG00000276256 | Gene Expression | 3 | 3 | 0.000257 | 99.974339 | 3.0 | False | ... | 80 | 0.014462 | 98.571684 | 81.0 | False | 0.007251 | -0.257485 | -0.504642 | 2.619076e-11 | 0.051217 |
| AL354822.1 | False | False | ENSG00000278384 | Gene Expression | 221 | 221 | 0.019246 | 98.109657 | 225.0 | False | ... | 560 | 0.111766 | 90.001785 | 626.0 | False | 0.057898 | 0.246208 | 0.236461 | -5.957352e-11 | 0.151154 |
| AL592183.1 | False | False | ENSG00000273748 | Gene Expression | 34 | 34 | 0.002908 | 99.709178 | 34.0 | False | ... | 305 | 0.055526 | 94.554544 | 311.0 | False | 0.026447 | 0.058870 | -0.039177 | -2.351471e-11 | 0.096522 |
| AC240274.1 | False | False | ENSG00000271254 | Gene Expression | 232 | 232 | 0.020101 | 98.015568 | 235.0 | False | ... | 413 | 0.082307 | 92.626317 | 461.0 | False | 0.033696 | -0.473253 | -0.822111 | -2.721484e-13 | 0.104288 |
| AC007325.4 | False | False | ENSG00000278817 | Gene Expression | 201 | 201 | 0.017620 | 98.280729 | 206.0 | False | ... | 416 | 0.079450 | 92.572755 | 445.0 | False | 0.031798 | -0.595366 | -1.001779 | -2.535163e-11 | 0.098411 |
14143 rows × 161 columns
# Print merged adata obs observation types
display (adata.obs)
| type | sample | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | total_counts_ribo | ... | leiden | leiden_res0_20 | leiden_res0_40 | leiden_res0_60 | leiden_res0_80 | leiden_res1 | S_score | G2M_score | phase | batch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCAAGAAGAGCA-1-0 | Sarcoidosis | BAL-Sarc-1 | 0.125612 | 0.0 | 2350 | 2350 | 7961.0 | 695.0 | 8.730059 | 984.0 | ... | 5 | 0 | 2 | 3 | 4 | 7 | -0.036949 | -0.071471 | G1 | 0 |
| AAACCCAAGAGAACCC-1-0 | Sarcoidosis | BAL-Sarc-1 | 0.087408 | 0.0 | 4492 | 4492 | 19449.0 | 1400.0 | 7.198314 | 1776.0 | ... | 0 | 0 | 1 | 1 | 6 | 8 | 0.023165 | -0.022110 | S | 0 |
| AAACCCAAGCGCTGAA-1-0 | Sarcoidosis | BAL-Sarc-1 | 0.099226 | 0.0 | 2006 | 2006 | 5039.0 | 388.0 | 7.699940 | 587.0 | ... | 8 | 3 | 5 | 8 | 8 | 9 | 0.018635 | 0.622611 | G2M | 0 |
| AAACCCAAGCGGGTAT-1-0 | Sarcoidosis | BAL-Sarc-1 | 0.133571 | 0.0 | 1569 | 1569 | 4492.0 | 340.0 | 7.569011 | 727.0 | ... | 1 | 0 | 0 | 0 | 5 | 1 | -0.034260 | -0.012191 | G1 | 0 |
| AAACCCAAGCTGCCTG-1-0 | Sarcoidosis | BAL-Sarc-1 | 0.629591 | 0.0 | 547 | 547 | 953.0 | 38.0 | 3.987408 | 237.0 | ... | 6 | 2 | 3 | 5 | 7 | 4 | 0.107249 | -0.023425 | S | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTGGTTTCCCAACTC-12 | Healthy | BAL-healthy-10 | 0.088613 | 0.0 | 1098 | 1096 | 2254.0 | 358.0 | 15.882875 | 247.0 | ... | 6 | 2 | 3 | 6 | 6 | 10 | -0.100588 | -0.052388 | G1 | 12 |
| TTTGGTTTCTGCTGAA-12 | Healthy | BAL-healthy-10 | 0.105251 | 0.0 | 5325 | 5323 | 36102.0 | 4156.0 | 11.511827 | 2797.0 | ... | 11 | 0 | 0 | 3 | 12 | 15 | -0.077294 | -0.111643 | G1 | 12 |
| TTTGTTGCACGTCATA-12 | Healthy | BAL-healthy-10 | 0.143119 | 0.0 | 5171 | 5168 | 22356.0 | 2060.0 | 9.214529 | 1264.0 | ... | 5 | 3 | 4 | 10 | 13 | 16 | -0.093467 | -0.105851 | G1 | 12 |
| TTTGTTGGTAAGGTCG-12 | Healthy | BAL-healthy-10 | 0.147944 | 0.0 | 3998 | 3998 | 20278.0 | 2364.0 | 11.657955 | 2229.0 | ... | 1 | 0 | 1 | 1 | 2 | 9 | -0.070404 | -0.099629 | G1 | 12 |
| TTTGTTGTCAACACCA-12 | Healthy | BAL-healthy-10 | 0.182432 | 0.0 | 3091 | 3091 | 10963.0 | 1287.0 | 11.739488 | 1386.0 | ... | 3 | 3 | 4 | 5 | 9 | 8 | -0.098344 | -0.125085 | G1 | 12 |
74394 rows × 21 columns
UMAP after sample integration
sc.pl.umap(adata, color="sample")
sc.pl.umap(adata, color="leiden")
#Doublet detetection using Scrublet
import warnings
# Suppress RuntimeWarning for invalid value encountered in log
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="invalid value encountered in log")
#warnings.filterwarnings("ignore", message="invalid value encountered in sqrt")
print ("Doublet detection")
import scrublet as scr
# split per batch into new objects.
batches = adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
tmp = adata[adata.obs['sample'] == batch,]
print(batch, ":", tmp.shape[0], " cells")
scrub = scr.Scrublet(tmp.raw.X, expected_doublet_rate = 0.06)
out = scrub.scrub_doublets(verbose=False, min_counts=2, min_cells=3,min_gene_variability_pctl=85,n_prin_comps=20)
alldata[batch] = pd.DataFrame({'doublet_score':out[0],'predicted_doublets':out[1]},index = tmp.obs.index)
print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")
print(round(alldata[batch].predicted_doublets.sum()/tmp.shape[0]*100,2), " predicted doublet Percentage\n")
Doublet detection BAL-Sarc-1 : 11012 cells 5 predicted_doublets 0.05 predicted doublet Percentage BAL-Sarc-2 : 11241 cells 5 predicted_doublets 0.04 predicted doublet Percentage BAL-Sarc-3 : 4547 cells 3 predicted_doublets 0.07 predicted doublet Percentage BAL-healthy-1 : 5607 cells 4 predicted_doublets 0.07 predicted doublet Percentage BAL-healthy-2 : 5710 cells 5 predicted_doublets 0.09 predicted doublet Percentage BAL-healthy-3 : 4336 cells 5 predicted_doublets 0.12 predicted doublet Percentage BAL-healthy-4 : 4397 cells 11 predicted_doublets 0.25 predicted doublet Percentage BAL-healthy-5 : 5220 cells 0 predicted_doublets 0.0 predicted doublet Percentage BAL-healthy-6 : 4731 cells 5 predicted_doublets 0.11 predicted doublet Percentage BAL-healthy-7 : 5327 cells 0 predicted_doublets 0.0 predicted doublet Percentage BAL-healthy-8 : 6108 cells 6 predicted_doublets 0.1 predicted doublet Percentage BAL-healthy-9 : 2438 cells 4 predicted_doublets 0.16 predicted doublet Percentage BAL-healthy-10 : 3720 cells 7 predicted_doublets 0.19 predicted doublet Percentage
Histogram plot of doublet detection
#Histogram plot doublet detection
scrub.plot_histogram();
Doublet detector predictions adding to the adata object
# Doublet detector predictions adding to the adata object.
scrub_pred = pd.concat(alldata.values())
adata.obs['doublet_scores'] = scrub_pred['doublet_score']
adata.obs['predicted_doublets'] = scrub_pred['predicted_doublets']
print("Total predicted doublets of all samples:", sum(adata.obs['predicted_doublets']))
Total predicted doublets of all samples: 60
True means: confirmed doublets and False means: singlets
# add in column with singlet/doublet instead of True/False
%matplotlib inline
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)
sc.pl.violin(adata, 'n_genes_by_counts',
jitter=0.4, groupby = 'doublet_info', rotation=45)
... storing 'doublet_info' as categorical
Displaying Doublet scores, Doublet info and Sample wise distrubtion
#Displaying Doublet scores, Doublet info and Sample wise distrubtion
print ("Doublet finding plots with scores and info across the samples")
sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample'])
Doublet finding plots with scores and info across the samples
Doublet removing and Rest samples for post processing
#Doublet removing and Rest samples for post processing
print ("Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis")
# also revert back to the raw counts as the main matrix in adata
adata = adata.raw.to_adata()
adata = adata[adata.obs['doublet_info'] == 'False',:]
print("Current shape of the data:", adata.shape)
Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis Current shape of the data: (74334, 14143)
Displaying merged object
adata
View of AnnData object with n_obs × n_vars = 74334 × 14143
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info'
uns: 'sample_colors', 'leiden_colors', 'doublet_info_colors'
obsm: 'X_pca', 'X_umap'
We conducted a comparison between our dataset and two existing methods: BBKN (Batch balanced KNN) introduced in the paper published in 2019 in Bioinformatics, and Harmony, presented in the 2019 Nature paper.
BBKN: Batch balanced KNN method
%%time
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
sc.tl.pca(adata_temp)
#%%time
sc.external.pp.bbknn(adata_temp, batch_key='sample')
sc.tl.umap(adata_temp)
sc.pl.umap(adata_temp, color=['sample', 'leiden'])
del adata_temp
computing PCA
with n_comps=50
finished (0:02:39)
computing batch balanced neighbors
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:57)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:02:42)
CPU times: user 12min 41s, sys: 11min 30s, total: 24min 12s Wall time: 6min 25s
Harmony method
%%time
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
sce.pp.harmony_integrate(adata_temp, 'sample')
def post_process_harmonization(adata_temp):
print("Post-processing of Harmonization")
# Set current PCA to be aligned with Harmonized PCA values
adata_temp.obsm['X_pca'] = adata_temp.obsm['X_pca_harmony']
# Compute neighborhood graphs and clustering
print("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata_temp, n_neighbors=15, n_pcs=20)
# Cluster the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata_temp)
sc.tl.umap(adata_temp)
# Example usage
post_process_harmonization(adata_temp)
sc.tl.umap(adata_temp)
sc.pl.umap(adata_temp, color=['sample'])
sc.pl.umap(adata_temp, color=['leiden'])
del adata_temp
2024-03-20 13:35:52,129 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... INFO:harmonypy:Computing initial centroids with sklearn.KMeans... 2024-03-20 13:36:02,605 - harmonypy - INFO - sklearn.KMeans initialization complete. INFO:harmonypy:sklearn.KMeans initialization complete. 2024-03-20 13:36:03,609 - harmonypy - INFO - Iteration 1 of 10 INFO:harmonypy:Iteration 1 of 10 2024-03-20 13:36:56,006 - harmonypy - INFO - Iteration 2 of 10 INFO:harmonypy:Iteration 2 of 10 2024-03-20 13:37:49,206 - harmonypy - INFO - Iteration 3 of 10 INFO:harmonypy:Iteration 3 of 10 2024-03-20 13:38:40,775 - harmonypy - INFO - Iteration 4 of 10 INFO:harmonypy:Iteration 4 of 10 2024-03-20 13:39:29,468 - harmonypy - INFO - Iteration 5 of 10 INFO:harmonypy:Iteration 5 of 10 2024-03-20 13:40:06,216 - harmonypy - INFO - Iteration 6 of 10 INFO:harmonypy:Iteration 6 of 10 2024-03-20 13:40:33,879 - harmonypy - INFO - Iteration 7 of 10 INFO:harmonypy:Iteration 7 of 10 2024-03-20 13:41:04,281 - harmonypy - INFO - Iteration 8 of 10 INFO:harmonypy:Iteration 8 of 10 2024-03-20 13:41:33,418 - harmonypy - INFO - Converged after 8 iterations INFO:harmonypy:Converged after 8 iterations
Post-processing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:30)
running Leiden clustering
finished: found 18 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:23)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:52)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:52)
CPU times: user 1h 6min 11s, sys: 1h 29min 53s, total: 2h 36min 5s Wall time: 10min 25s
Violin plot of 'G2M_score', 'S_score' across samples
sc.pl.violin(adata, keys=['G2M_score', 'S_score'], groupby='sample',rotation=90)
PCA plot of 'G2M_score', 'S_score' and 'phase' across samples
sc.pl.pca(adata, color= ['S_score', 'G2M_score','phase'])
Display merged object adata
adata
AnnData object with n_obs × n_vars = 74334 × 14143
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info'
uns: 'sample_colors', 'leiden_colors', 'doublet_info_colors', 'phase_colors'
obsm: 'X_pca', 'X_umap'
Harmony method was appiled into actual merged object
%%time
sce.pp.harmony_integrate(adata, 'sample')
def post_process_harmonization(adata):
print("Post-processing of Harmonization")
# Set current PCA to be aligned with Harmonized PCA values
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']
# Compute neighborhood graphs and clustering
print("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20)
# Cluster the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)
# usage
post_process_harmonization(adata)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['sample', 'leiden'])
2024-03-20 13:46:24,205 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... INFO:harmonypy:Computing initial centroids with sklearn.KMeans... 2024-03-20 13:46:35,092 - harmonypy - INFO - sklearn.KMeans initialization complete. INFO:harmonypy:sklearn.KMeans initialization complete. 2024-03-20 13:46:36,094 - harmonypy - INFO - Iteration 1 of 10 INFO:harmonypy:Iteration 1 of 10 2024-03-20 13:47:30,413 - harmonypy - INFO - Iteration 2 of 10 INFO:harmonypy:Iteration 2 of 10 2024-03-20 13:48:23,701 - harmonypy - INFO - Iteration 3 of 10 INFO:harmonypy:Iteration 3 of 10 2024-03-20 13:49:16,851 - harmonypy - INFO - Iteration 4 of 10 INFO:harmonypy:Iteration 4 of 10 2024-03-20 13:50:08,835 - harmonypy - INFO - Iteration 5 of 10 INFO:harmonypy:Iteration 5 of 10 2024-03-20 13:50:47,113 - harmonypy - INFO - Iteration 6 of 10 INFO:harmonypy:Iteration 6 of 10 2024-03-20 13:51:15,243 - harmonypy - INFO - Iteration 7 of 10 INFO:harmonypy:Iteration 7 of 10 2024-03-20 13:51:46,336 - harmonypy - INFO - Iteration 8 of 10 INFO:harmonypy:Iteration 8 of 10 2024-03-20 13:52:09,464 - harmonypy - INFO - Converged after 8 iterations INFO:harmonypy:Converged after 8 iterations
Post-processing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:26)
running Leiden clustering
finished: found 18 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:23)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:56)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:52)
CPU times: user 1h 5min 57s, sys: 1h 29min 4s, total: 2h 35min 1s Wall time: 10min 27s
UMAP using rc_context method
sc.settings.set_figure_params(dpi=100, facecolor='white')
with rc_context({'figure.figsize': (4, 4)}):
sc.pl.umap(adata, color = 'sample')
sc.pl.umap(adata, color = 'leiden')
import pandas as pd
# Assuming 'merged' is your DataFrame
counts = adata.obs.groupby(['sample', 'leiden']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
tot = counts.loc[counts['sample'] == samp, 'total_counts'].sum()
return (y / tot) * 100 if tot != 0 else 0
counts['per'] = counts.apply(map_per, axis=1)
# Display the resulting DataFrame
display(counts)
| sample | leiden | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | leiden_res0_80 | leiden_res1 | S_score | G2M_score | phase | batch | doublet_scores | predicted_doublets | doublet_info | per | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BAL-Sarc-1 | 0 | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | ... | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | 1421 | 12.909966 |
| 1 | BAL-Sarc-1 | 1 | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | ... | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | 1363 | 12.383029 |
| 2 | BAL-Sarc-1 | 2 | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | ... | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | 1109 | 10.075407 |
| 3 | BAL-Sarc-1 | 3 | 875 | 875 | 875 | 875 | 875 | 875 | 875 | 875 | ... | 875 | 875 | 875 | 875 | 875 | 875 | 875 | 875 | 875 | 7.949487 |
| 4 | BAL-Sarc-1 | 4 | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | ... | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | 1003 | 9.112383 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 229 | BAL-healthy-10 | 13 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | ... | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 0.888769 |
| 230 | BAL-healthy-10 | 14 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | ... | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 0.646378 |
| 231 | BAL-healthy-10 | 15 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 0.269324 |
| 232 | BAL-healthy-10 | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 16 | ... | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 0.430918 |
| 233 | BAL-healthy-10 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
234 rows × 25 columns
Plot of Percentage of Total Counts across leiden clusters
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden', y = 'per', hue = 'sample', data = counts)
ax.set(xlabel='Leiden',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f90931fce80>
Plot of Percentage of Total Counts across samples
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 3))
ax = sns.barplot(x='sample', y='per', hue='leiden', data=counts)
ax.set(xlabel='sample ',
ylabel='Percentage_total_counts')
plt.legend(bbox_to_anchor=(1.4, 0.8))
# Rotate x-axis labels by 90 degrees
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()
Sample wise density estimation
#Gaussian kernel density estimation is used to calculate the density of cells in an embedded space.
#This can be performed per category over a categorical cell annotation.
#Calcuting the density per sample
sc.tl.embedding_density(adata, groupby='sample')
#plot the density per sample
sc.pl.embedding_density(adata, groupby='sample')
computing density on 'umap'
--> added
'umap_density_sample', densities (adata.obs)
'umap_density_sample_params', parameter (adata.uns)
Sample wise density estimation across leiden clusters
#Gaussian kernel density estimation is used to calculate the density of cells in an embedded space.
#This can be performed per category over a categorical cell annotation.
#Calcuting the density per sample
sc.tl.embedding_density(adata, groupby='leiden')
#plot the density per sample
sc.pl.embedding_density(adata, groupby='leiden')
computing density on 'umap'
--> added
'umap_density_leiden', densities (adata.obs)
'umap_density_leiden_params', parameter (adata.uns)
#Computing with a series of resolution parameters and silhouette_scores.
#Like various algorithms, Leiden has also a parameter named the resolution.
#It can control the coarseness of the clustering.
#Higher values of resolution mean it leads to more clusters.
#Computing Silhouette Coefficient or Silhouette Score, a metric that was used to calculate the goodness of a clustering.
# -1 <= silhouette score<= 1.
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
from sklearn.metrics import silhouette_score
# Define a list of resolution parameters
resolutions = [round(r, 2) for r in [.05] + list(np.linspace(.1, 1.6, 16))]
# Print a message indicating the start of the computation
print("Computing silhouette scores with different resolution parameters")
# Iterate over each resolution parameter and compute the silhouette score
for resolution in resolutions:
# Apply the Leiden clustering algorithm with the current resolution parameter
sc.tl.leiden(adata_temp, resolution=resolution)
# Compute the silhouette score for the clustering result
silhouette = silhouette_score(adata_temp.obsm['X_umap'], adata_temp.obs[f'leiden'], metric='euclidean')
# Print the silhouette score for the current resolution parameter
print(f"Silhouette score for resolution {resolution}: {silhouette}\n")
#delete temporary adata_temp
del adata_temp
Computing silhouette scores with different resolution parameters
running Leiden clustering
finished: found 5 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:18)
Silhouette score for resolution 0.05: 0.2511586844921112
running Leiden clustering
finished: found 7 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:18)
Silhouette score for resolution 0.1: 0.22194606065750122
running Leiden clustering
finished: found 9 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:24)
Silhouette score for resolution 0.2: 0.3125682771205902
running Leiden clustering
finished: found 10 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:39)
Silhouette score for resolution 0.3: 0.2995833456516266
running Leiden clustering
finished: found 13 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:35)
Silhouette score for resolution 0.4: 0.2517308294773102
running Leiden clustering
finished: found 14 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:49)
Silhouette score for resolution 0.5: 0.22643430531024933
running Leiden clustering
finished: found 16 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:55)
Silhouette score for resolution 0.6: 0.18102161586284637
running Leiden clustering
finished: found 17 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:49)
Silhouette score for resolution 0.7: 0.21208927035331726
running Leiden clustering
finished: found 17 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:45)
Silhouette score for resolution 0.8: 0.21742117404937744
running Leiden clustering
finished: found 17 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:26)
Silhouette score for resolution 0.9: 0.19683261215686798
running Leiden clustering
finished: found 18 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:28)
Silhouette score for resolution 1.0: 0.19462238252162933
running Leiden clustering
finished: found 22 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:41)
Silhouette score for resolution 1.1: 0.12883161008358002
running Leiden clustering
finished: found 22 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:53)
Silhouette score for resolution 1.2: 0.15771940350532532
running Leiden clustering
finished: found 23 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:01:16)
Silhouette score for resolution 1.3: 0.1640017181634903
running Leiden clustering
finished: found 26 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:01:09)
Silhouette score for resolution 1.4: 0.15771687030792236
running Leiden clustering
finished: found 28 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:40)
Silhouette score for resolution 1.5: 0.14448630809783936
running Leiden clustering
finished: found 31 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:36)
Silhouette score for resolution 1.6: 0.12331264466047287
Plot of Percentage of Total Counts, samples, clusters and average silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
# Assuming you have already loaded your data into variables adata and adata_temp
adata_temp=adata.copy()
# Calculate silhouette score
silhouette_avg = silhouette_score(adata.obsm['X_umap'], adata.obs[f'leiden'], metric='euclidean')
print("The average silhouette_score is :", silhouette_avg)
# Plot
plt.figure(figsize=(10, 3))
ax = sns.barplot(x='sample', y='per', hue='leiden', data=counts)
ax.set(xlabel='sample ', ylabel='Percentage_total_counts')
plt.legend(bbox_to_anchor=(1.4, 0.8))
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()
The average silhouette_score is : 0.19462238
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
# Assuming you have already loaded your data into variables adata and adata_temp
# Calculate silhouette scores for each sample
silhouette_scores = silhouette_samples(adata.obsm['X_umap'], np.array(adata.obs[f'leiden']), metric='euclidean')
# Calculate the overall silhouette score
silhouette_avg = silhouette_score(adata.obsm['X_umap'], np.array(adata.obs[f'leiden']), metric='euclidean')
# Get sample names
sample_names = adata.obs['sample'] # Assuming 'sample' is the column name containing sample names
# Create a bar plot of silhouette scores with sample names on the x-axis
plt.figure(figsize=(15, 6)) # Adjusted figure size for better readability
ax = sns.barplot(x=sample_names, y=silhouette_scores, palette='viridis')
# Add a horizontal line for the average silhouette score
plt.axhline(y=silhouette_avg, color="red", linestyle="--")
plt.title('Silhouette plot')
plt.xlabel('Samples')
plt.ylabel('Silhouette score')
plt.xticks(rotation=90) # Rotate x-axis labels for better readability
plt.legend(bbox_to_anchor=(1.4, 0.8))
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
The keys corresponding to silhouette scores are added to the merged objects. We opted for three resolutions, namely 0.5, 0.6, and 1.0 (default), for Leiden clustering
#clustering comparison
#Based on Silhouette scores, We selected four resolutions 0.4 and 0.5, 0.8, and 1.0 (default) for leiden clustering
#default resolution 1.0 with Silhouette score: 0.19, leiden clustering and key is added
sc.tl.leiden(adata, key_added="leiden_1.0")
#Resolution 0.8 with with Silhouette score: 0.22, leiden clustering and key is added
sc.tl.leiden(adata, resolution = 0.8, key_added = "leiden_0.8")
#Resolution 0.5 with with Silhouette score: 0.23, leiden clustering and key is added
sc.tl.leiden(adata, resolution = 0.5, key_added = "leiden_0.5")
#Resolution 0.4 with with Silhouette score: 0.25, leiden clustering and key is added
sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4")
running Leiden clustering
finished: found 18 clusters and added
'leiden_1.0', the cluster labels (adata.obs, categorical) (0:00:24)
running Leiden clustering
finished: found 17 clusters and added
'leiden_0.8', the cluster labels (adata.obs, categorical) (0:00:45)
running Leiden clustering
finished: found 14 clusters and added
'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:47)
running Leiden clustering
finished: found 13 clusters and added
'leiden_0.4', the cluster labels (adata.obs, categorical) (0:00:33)
clustering UMAP plot based Silhouette scores in decresing order
#clustering plot based Silhouette scores in increasing order
sc.pl.umap(adata, color=['leiden_1.0','leiden_0.8', 'leiden_0.5','leiden_0.4'], legend_loc="on data")
#import scipy io package
from scipy import io
save_file = '/home/jana/Updated_SCANPY_QC_filtered_BAL_for_Sarcoidosis.h5ad'
adata.write_h5ad(save_file)
Percentage Checking of Total Counts checking inside leiden_0.5
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Assuming 'merged' is your DataFrame
counts_05 = adata.obs.groupby(['sample', 'leiden_0.5']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
tot05 = counts_05.loc[counts_05['sample'] == samp, 'total_counts'].sum()
return (y / tot05) * 100 if tot05 != 0 else 0
counts_05['pernew'] = counts_05.apply(map_per, axis=1)
# Display the resulting DataFrame
display(counts_05)
| sample | leiden_0.5 | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | batch | doublet_scores | predicted_doublets | doublet_info | umap_density_sample | umap_density_leiden | leiden_1.0 | leiden_0.8 | leiden_0.4 | pernew | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BAL-Sarc-1 | 0 | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | ... | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | 2013 | 18.288362 |
| 1 | BAL-Sarc-1 | 1 | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | ... | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | 1935 | 17.579722 |
| 2 | BAL-Sarc-1 | 2 | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | ... | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | 1700 | 15.444717 |
| 3 | BAL-Sarc-1 | 3 | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | ... | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | 1340 | 12.174071 |
| 4 | BAL-Sarc-1 | 4 | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | ... | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | 1033 | 9.384937 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 177 | BAL-healthy-10 | 9 | 60 | 60 | 60 | 60 | 60 | 60 | 60 | 60 | ... | 60 | 60 | 60 | 60 | 60 | 60 | 60 | 60 | 60 | 1.615944 |
| 178 | BAL-healthy-10 | 10 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | ... | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 1.265823 |
| 179 | BAL-healthy-10 | 11 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | ... | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 0.242392 |
| 180 | BAL-healthy-10 | 12 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | ... | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 0.646378 |
| 181 | BAL-healthy-10 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
182 rows × 31 columns
Plot of Percentage Total Counts vs leiden_0.5 of all samples
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden_0.5', y = 'pernew', hue = 'sample', data = counts_05)
ax.set(xlabel='leiden_0.5',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f8eb28fff10>
Percentage Checking of Total Counts checking inside leiden_0.4
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Assuming 'merged' is your DataFrame
countsp_04 = adata.obs.groupby(['sample', 'leiden_0.4']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
totp04 = countsp_04.loc[countsp_04['sample'] == samp, 'total_counts'].sum()
return (y / totp04) * 100 if totp04 != 0 else 0
countsp_04['pernewpo4'] = countsp_04.apply(map_per, axis=1)
# Display the resulting DataFrame
display(countsp_04)
| sample | leiden_0.4 | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | batch | doublet_scores | predicted_doublets | doublet_info | umap_density_sample | umap_density_leiden | leiden_1.0 | leiden_0.8 | leiden_0.5 | pernewpo4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BAL-Sarc-1 | 0 | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | ... | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | 2281 | 20.723176 |
| 1 | BAL-Sarc-1 | 1 | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | ... | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | 2048 | 18.606341 |
| 2 | BAL-Sarc-1 | 2 | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | ... | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | 1742 | 15.826292 |
| 3 | BAL-Sarc-1 | 3 | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | ... | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | 1521 | 13.818479 |
| 4 | BAL-Sarc-1 | 4 | 945 | 945 | 945 | 945 | 945 | 945 | 945 | 945 | ... | 945 | 945 | 945 | 945 | 945 | 945 | 945 | 945 | 945 | 8.585446 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | BAL-healthy-10 | 8 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | ... | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 1.562079 |
| 165 | BAL-healthy-10 | 9 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | ... | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 47 | 1.265823 |
| 166 | BAL-healthy-10 | 10 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | ... | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 0.646378 |
| 167 | BAL-healthy-10 | 11 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | ... | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 0.242392 |
| 168 | BAL-healthy-10 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
169 rows × 31 columns
Plot of Percentage Total Counts vs leiden_0.4 of all samples
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden_0.4', y = 'pernewpo4', hue = 'sample', data = countsp_04)
ax.set(xlabel='leiden_0.4',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f8e4a3a70d0>
Percentage Checking of Total Counts checking inside leiden_0.8
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Assuming 'merged' is your DataFrame
counts_08 = adata.obs.groupby(['sample', 'leiden_0.8']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
tot08 = counts_08.loc[counts_08['sample'] == samp, 'total_counts'].sum()
return (y / tot08) * 100 if tot08 != 0 else 0
counts_08['pernewpo8'] = counts_08.apply(map_per, axis=1)
# Display the resulting DataFrame
display(counts_08)
| sample | leiden_0.8 | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | batch | doublet_scores | predicted_doublets | doublet_info | umap_density_sample | umap_density_leiden | leiden_1.0 | leiden_0.5 | leiden_0.4 | pernewpo8 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BAL-Sarc-1 | 0 | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | ... | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | 1593 | 14.472608 |
| 1 | BAL-Sarc-1 | 1 | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | ... | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | 1146 | 10.411556 |
| 2 | BAL-Sarc-1 | 2 | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | ... | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | 1251 | 11.365495 |
| 3 | BAL-Sarc-1 | 3 | 826 | 826 | 826 | 826 | 826 | 826 | 826 | 826 | ... | 826 | 826 | 826 | 826 | 826 | 826 | 826 | 826 | 826 | 7.504315 |
| 4 | BAL-Sarc-1 | 4 | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | ... | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | 1009 | 9.166894 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 216 | BAL-healthy-10 | 12 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | ... | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 58 | 1.562079 |
| 217 | BAL-healthy-10 | 13 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | ... | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 1.319688 |
| 218 | BAL-healthy-10 | 14 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | ... | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 0.646378 |
| 219 | BAL-healthy-10 | 15 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | ... | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 0.242392 |
| 220 | BAL-healthy-10 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
221 rows × 31 columns
Plot of Percentage Total Counts vs leiden_0.8 of all samples
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden_0.8', y = 'pernewpo8', hue = 'sample', data = counts_08)
ax.set(xlabel='leiden_0.8',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f8e38fc0220>
#import scipy io package
from scipy import io
save_file = '/home/jana/Updated_SCANPY_QC_filtered_BAL_for_Sarcoidosis.h5ad'
adata.write_h5ad(save_file)
#saving into CSV data
counts.to_csv('/home/jana/Merged_BAL_new_object_metadata.csv')
Delete all samples variables with merged object
del(adata, balsarc1, balsarc2, balsarc3, balhealthy1, balhealthy2, balhealthy3, balhealthy4, balhealthy5, balhealthy6, balhealthy7, balhealthy8, balhealthy9, balhealthy10)